library(lavaan)
library(dplyr)
library(reticulate)
library(marginaleffects)
library(modelsummary)
library(ggplot2)
library(tidyr)
library(egg)
library(lme4)
library(semPlot)
library(tinytable)
library(kableExtra)
library(reshape2)
reticulate::py_run_string("import pymc as pm")
options(rstudio.python.installationPath = "/Users/nathanielforde/mambaforge/envs")
options("modelsummary_factory_default" = "tinytable")
options(repr.plot.width=15, repr.plot.height=8)
knitr::knit_engines$set(python = reticulate::eng_python)
options(scipen=999)
set.seed(130)This is a work in progress. We intend to elaborate the choices related to analysing and understanding the data elicted through psychometrics surveys. Psychometric phenomena are inherently complex an imprecisely measured. Special attention needs to the assumptions built into the methodologies used to analyse this species of data.
Measurment and Measurement Constructs
Science is easier when there is a recipe. When there is some formula to follow, a procedure to adopt or routine to ape, you can out-source the responsibility for theory construction and methodological justification. One egregious pattern in this vein masks implausible nonsense with much vaunted mathematical gloss of “statistical significance”. Seen from 1000 feet, this is not surprising. Lip-service is paid to the idea of scientific method and we absolve ourselves of the requirement for principled justification and substantive argument. Evidence is marshaled in service to argument. It’s an absurd pretense to claim that data speaks for itself in this argument.
Data is found, gathered or maybe even carefully curated. In all cases there is need for a defensive posture, an argument that the data is fit-for-purpose. No where is this more clear than in psychometrics where the data is often derived from a strategically constructed survey aimed at a particular target phenomena. Some intuited, but not yet measured, concept that arguably plays a role in human action, motivation or sentiment. The “fuzziness” of the subject matter in psychometrics has had a catalyzing effect on the methodological rigour sought in the science. Survey designs are agonised over for correct tone and rhythm of sentence structure. Analysis steps are justified and tested under a wealth of modelling routines. Model architectures are defined and refined to better express the hypothesised structures in the data-generating process.
We will examine a smattering of choices available in the analysis of psychometric survey data. We will first step through these analysis routines using lmer, lavaan before switching to PyMC to demonstrate how to fit Confirmatory Factor Analysis models and Structural Equation Models in a Bayesian fashion using a probabilistic programming language.
The Data
The data is borrowed from work here demonstrating SEM modelling with Lavaan. We’ll load it up and begind some exploratory work.
df = read.csv('sem_data.csv')
df$ls_sum <- df$ls_p1 + df$ls_p2 + df$ls_p3
df$ls_mean <- rowMeans(df[ c('ls_p1', 'ls_p2', 'ls_p3')])
head(df) |> kable() |> kable_styling() |> kable_classic(full_width = F, html_font = "Cambria")| ID | region | gender | age | se_acad_p1 | se_acad_p2 | se_acad_p3 | se_social_p1 | se_social_p2 | se_social_p3 | sup_friends_p1 | sup_friends_p2 | sup_friends_p3 | sup_parents_p1 | sup_parents_p2 | sup_parents_p3 | ls_p1 | ls_p2 | ls_p3 | ls_sum | ls_mean |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | west | female | 13 | 4.857143 | 5.571429 | 4.500000 | 5.80 | 5.500000 | 5.40 | 6.5 | 6.5 | 7.0 | 7.0 | 7.0 | 6.0 | 5.333333 | 6.75 | 5.50 | 17.58333 | 5.861111 |
| 2 | west | male | 14 | 4.571429 | 4.285714 | 4.666667 | 5.00 | 5.500000 | 4.80 | 4.5 | 4.5 | 5.5 | 5.0 | 6.0 | 4.5 | 4.333333 | 5.00 | 4.50 | 13.83333 | 4.611111 |
| 10 | west | female | 14 | 4.142857 | 6.142857 | 5.333333 | 5.20 | 4.666667 | 6.00 | 4.0 | 4.5 | 3.5 | 7.0 | 7.0 | 6.5 | 6.333333 | 5.50 | 4.00 | 15.83333 | 5.277778 |
| 11 | west | female | 14 | 5.000000 | 5.428571 | 4.833333 | 6.40 | 5.833333 | 6.40 | 7.0 | 7.0 | 7.0 | 7.0 | 7.0 | 7.0 | 4.333333 | 6.50 | 6.25 | 17.08333 | 5.694444 |
| 12 | west | female | 14 | 5.166667 | 5.600000 | 4.800000 | 5.25 | 5.400000 | 5.25 | 7.0 | 7.0 | 7.0 | 6.5 | 6.5 | 7.0 | 5.666667 | 6.00 | 5.75 | 17.41667 | 5.805556 |
| 14 | west | male | 14 | 4.857143 | 4.857143 | 4.166667 | 5.20 | 5.000000 | 4.20 | 5.5 | 6.5 | 7.0 | 6.5 | 6.5 | 6.5 | 5.000000 | 5.50 | 5.50 | 16.00000 | 5.333333 |
The hypothetical dependency structure in this life-satisfaction data set posits a moderations relationship between scores related to life-satisfaction, parental and family support and self-efficacy. It is not a trivial task to be able to design a survey that can elicit answers plausibly mapped to each of these “factors” or themes, never mind finding a model of their relationship that can inform us as to the relative of impact of each on life-satisfaction outcomes.
We can pull out the high level summary statistics to better observe the level of information in our metrics. These metrics are thematically clustered because the source survey deliberately targetted each of the underlying themes.
#| code-fold: true
#| code-summary: "Show the code"
datasummary_skim(df)|>
style_tt(
i = 15:17,
j = 1:1,
background = "#20AACC",
color = "white",
italic = TRUE) |>
style_tt(
i = 18:19,
j = 1:1,
background = "#2888A0",
color = "white",
italic = TRUE) |>
style_tt(
i = 2:14,
j = 1:1,
background = "#17C2AD",
color = "white",
italic = TRUE)| Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram | |
|---|---|---|---|---|---|---|---|---|
| ID | 283 | 0 | 187.9 | 106.3 | 1.0 | 201.0 | 367.0 | |
| age | 5 | 0 | 14.7 | 0.8 | 13.0 | 15.0 | 17.0 | |
| se_acad_p1 | 32 | 0 | 5.2 | 0.8 | 3.1 | 5.1 | 7.0 | |
| se_acad_p2 | 36 | 0 | 5.3 | 0.7 | 3.1 | 5.4 | 7.0 | |
| se_acad_p3 | 29 | 0 | 5.2 | 0.8 | 2.8 | 5.2 | 7.0 | |
| se_social_p1 | 24 | 0 | 5.3 | 0.8 | 1.8 | 5.4 | 7.0 | |
| se_social_p2 | 27 | 0 | 5.5 | 0.7 | 2.7 | 5.5 | 7.0 | |
| se_social_p3 | 31 | 0 | 5.4 | 0.8 | 3.0 | 5.5 | 7.0 | |
| sup_friends_p1 | 13 | 0 | 5.8 | 1.1 | 1.0 | 6.0 | 7.0 | |
| sup_friends_p2 | 10 | 0 | 6.0 | 0.9 | 2.5 | 6.0 | 7.0 | |
| sup_friends_p3 | 13 | 0 | 6.0 | 1.1 | 1.0 | 6.0 | 7.0 | |
| sup_parents_p1 | 11 | 0 | 6.0 | 1.1 | 1.0 | 6.0 | 7.0 | |
| sup_parents_p2 | 11 | 0 | 5.9 | 1.1 | 2.0 | 6.0 | 7.0 | |
| sup_parents_p3 | 13 | 0 | 5.7 | 1.1 | 1.0 | 6.0 | 7.0 | |
| ls_p1 | 15 | 0 | 5.2 | 0.9 | 2.0 | 5.3 | 7.0 | |
| ls_p2 | 21 | 0 | 5.8 | 0.7 | 2.5 | 5.8 | 7.0 | |
| ls_p3 | 22 | 0 | 5.2 | 0.9 | 2.0 | 5.2 | 7.0 | |
| ls_sum | 98 | 0 | 16.2 | 2.1 | 8.7 | 16.4 | 20.8 | |
| ls_mean | 97 | 0 | 5.4 | 0.7 | 2.9 | 5.5 | 6.9 | |
| N | % | |||||||
| region | east | 142 | 50.2 | |||||
| west | 141 | 49.8 | ||||||
| gender | female | 132 | 46.6 | |||||
| male | 151 | 53.4 |
Note how we’ve distinguished among the metrics for the “outcome” metrics and the “driver” metrics. Such a distinction may seem trivial, but it is only possible because we bring substantive knowledge to bear on the problem in the design of our survey and the postulation of the theoretical construct. Our data is a multivariate outcome with a large range of possible interacting effects and the patterns of realisation for our life-satisfaction scores may be quite different than the hypothesised structure. It is this open question that we’re aiming to uncover in the analysis of psychometrics surveys.
Sample Covariances and Correlations
drivers = c('se_acad_p1', 'se_acad_p2', 'se_acad_p3', 'se_social_p1', 'se_social_p2', 'se_social_p3', 'sup_friends_p1','sup_friends_p2', 'sup_friends_p3', 'sup_parents_p1' , 'sup_parents_p2' , 'sup_parents_p3', 'ls_p1', 'ls_p2', 'ls_p3')
plot_heatmap <- function(df, title="Sample Covariances", subtitle="Observed Measures") {
heat_df = df |> as.matrix() |> melt()
colnames(heat_df) <- c("x", "y", "value")
g <- heat_df |> mutate(value = round(value, 2)) |> ggplot(aes(x = x, y = y, fill = value)) +
geom_tile() + geom_text(aes(label = value), color = "black", size = 4) +
scale_fill_gradient2(
high = 'dodgerblue4',
mid = 'white',
low = 'firebrick2'
) + theme(axis.text.x = element_text(angle=45)) + ggtitle(title, subtitle)
g
}
g1 = plot_heatmap(cov(df[, drivers]))
g2 = plot_heatmap(cor(df[, drivers]), "Sample Correlations")
plot <- ggarrange(g1,g2, ncol=1, nrow=2);Note the relatively strong correlations between measures of parental support and the life-satisfaction outcome ls_p3. Similarly, how the social self-efficacy scores seem similarly correlated with the secondary life satisfaction indicator ls_p2. These observed correlations merit some further investigation. We can plot the pairs of scatter plots.
df <- df |>
mutate(id = row_number())
# Prepare data to be plotted on the x axis
x_vars <- pivot_longer(data = df,
cols = se_acad_p1:ls_p3,
names_to = "variable_x",
values_to = "x")
# Prepare data to be plotted on the y axis
y_vars <- pivot_longer(data = df,
cols = se_acad_p1:ls_p3,
names_to = "variable_y",
values_to = "y")
# Join data for x and y axes and make plot
full_join(x_vars, y_vars,
by = c("id"),
relationship = "many-to-many") |>
ggplot() +
aes(x = x, y = y) +
geom_point() +
facet_grid(c("variable_x", "variable_y")) + ggtitle("Pair Plot of Indicator Metrics",
"Comparing Against Life Satisfaction Scores")The scatter plots among the highly correlated variables in the heatmap do seem to exhibit some kind of linear relationship with aspects of the life-satisfaction scores. We now turn to modelling these relationships to tease out some kind of inferential summary of that relationship.
Fit Initial Regression Models
To model these relationships we make use of the aggregated sum and mean scores to model the relationships between these indicator metrics and life-satisfaction. We fit a variety of models each one escalating in the number of indicator metrics we incorporate into our model of the life-satisfaction outcome. This side-steps the multivariate nature of hypothesised constructs and crudely amalgamates the indicator metrics. This may be more or less justified depending on how similar in theme the three outcome questions ls_p1, ls_p2, ls_p3 are in nature.
formula_sum_1st = " ls_sum ~ se_acad_p1 + se_social_p1 + sup_friends_p1 + sup_parents_p1"
formula_mean_1st = " ls_mean ~ se_acad_p1 + se_social_p1 + sup_friends_p1 + sup_parents_p1"
formula_sum_12 = " ls_sum ~ se_acad_p1 + se_acad_p2 + se_social_p1 + se_social_p2 +
sup_friends_p1 + sup_friends_p2 + sup_parents_p1 + sup_parents_p2"
formula_mean_12 = " ls_mean ~ se_acad_p1 + se_acad_p2 + se_social_p1 + se_social_p2 +
sup_friends_p1 + sup_friends_p2 + sup_parents_p1 + sup_parents_p2"
formula_sum = " ls_sum ~ se_acad_p1 + se_acad_p2 + se_acad_p3 + se_social_p1 + se_social_p2 + se_social_p3 + sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + sup_parents_p1 + sup_parents_p2 + sup_parents_p3"
formula_mean = " ls_mean ~ se_acad_p1 + se_acad_p2 + se_acad_p3 + se_social_p1 + se_social_p2 + se_social_p3 + sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + sup_parents_p1 + sup_parents_p2 + sup_parents_p3"
mod_sum = lm(formula_sum, df)
mod_sum_1st = lm(formula_sum_1st, df)
mod_sum_12 = lm(formula_sum_12, df)
mod_mean = lm(formula_mean, df)
mod_mean_1st = lm(formula_mean_1st, df)
mod_mean_12 = lm(formula_mean_12, df)
min_max_norm <- function(x) {
(x - min(x)) / (max(x) - min(x))
}
df_norm <- as.data.frame(lapply(df[c(5:19)], min_max_norm))
df_norm$ls_sum <- df$ls_sum
df_norm$ls_mean <- df$ls_mean
mod_sum_norm = lm(formula_sum, df_norm)
mod_mean_norm = lm(formula_mean, df_norm)
models = list(
"Outcome: sum_score" = list("model_sum_1st_factors" = mod_sum_1st,
"model_sum_1st_2nd_factors" = mod_sum_12,
"model_sum_score" = mod_sum,
"model_sum_score_norm" = mod_sum_norm
),
"Outcome: mean_score" = list(
"model_mean_1st_factors" = mod_mean_1st,
"model_mean_1st_2nd_factors" = mod_mean_12,
"model_mean_score"= mod_mean,
"model_mean_score_norm" = mod_mean_norm
)
)The classical presentation of regression models reports the coefficient weights accorded to each of the input variables. We present these models to highlight that the manner in which we represent our theoretical constructs has ramifications for the interpretation of the data generating process. In particular, note how different degrees of significance are accorded to the different variables depending on which variables are included.
#| code-fold: true
#| code-summary: "Show the code"
modelsummary(models, stars=TRUE, shape ="cbind") |>
style_tt(
i = 2:25,
j = 1:1,
background = "#17C2AD",
color = "white",
italic = TRUE)| Outcome: sum_score | Outcome: mean_score | |||||||
|---|---|---|---|---|---|---|---|---|
| model_sum_1st_factors | model_sum_1st_2nd_factors | model_sum_score | model_sum_score_norm | model_mean_1st_factors | model_mean_1st_2nd_factors | model_mean_score | model_mean_score_norm | |
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||||||||
| (Intercept) | 5.118*** | 2.644** | 2.094* | 7.783*** | 1.706*** | 0.881** | 0.698* | 2.594*** |
| (0.907) | (0.985) | (0.954) | (0.658) | (0.302) | (0.328) | (0.318) | (0.219) | |
| se_acad_p1 | 0.242 | -0.034 | -0.208 | -0.800 | 0.081 | -0.011 | -0.069 | -0.267 |
| (0.147) | (0.180) | (0.192) | (0.742) | (0.049) | (0.060) | (0.064) | (0.247) | |
| se_social_p1 | 1.088*** | 0.501* | 0.355+ | 1.846+ | 0.363*** | 0.167* | 0.118+ | 0.615+ |
| (0.162) | (0.204) | (0.200) | (1.039) | (0.054) | (0.068) | (0.067) | (0.346) | |
| sup_friends_p1 | 0.125 | -0.224+ | -0.272+ | -1.630+ | 0.042 | -0.075+ | -0.091+ | -0.543+ |
| (0.088) | (0.133) | (0.150) | (0.901) | (0.029) | (0.044) | (0.050) | (0.300) | |
| sup_parents_p1 | 0.561*** | 0.238+ | 0.072 | 0.432 | 0.187*** | 0.079+ | 0.024 | 0.144 |
| (0.100) | (0.141) | (0.143) | (0.858) | (0.033) | (0.047) | (0.048) | (0.286) | |
| se_acad_p2 | 0.448* | 0.327 | 1.261 | 0.149* | 0.109 | 0.420 | ||
| (0.197) | (0.202) | (0.779) | (0.066) | (0.067) | (0.260) | |||
| se_social_p2 | 0.756*** | 0.509* | 2.206* | 0.252*** | 0.170* | 0.735* | ||
| (0.213) | (0.219) | (0.949) | (0.071) | (0.073) | (0.316) | |||
| sup_friends_p2 | 0.369* | 0.331* | 1.490* | 0.123* | 0.110* | 0.497* | ||
| (0.157) | (0.160) | (0.720) | (0.052) | (0.053) | (0.240) | |||
| sup_parents_p2 | 0.370** | 0.118 | 0.591 | 0.123** | 0.039 | 0.197 | ||
| (0.138) | (0.144) | (0.719) | (0.046) | (0.048) | (0.240) | |||
| se_acad_p3 | 0.153 | 0.637 | 0.051 | 0.212 | ||||
| (0.174) | (0.726) | (0.058) | (0.242) | |||||
| se_social_p3 | 0.443** | 1.771** | 0.148** | 0.590** | ||||
| (0.161) | (0.642) | (0.054) | (0.214) | |||||
| sup_friends_p3 | 0.165 | 0.989 | 0.055 | 0.330 | ||||
| (0.130) | (0.782) | (0.043) | (0.261) | |||||
| sup_parents_p3 | 0.526*** | 3.158*** | 0.175*** | 1.053*** | ||||
| (0.126) | (0.754) | (0.042) | (0.251) | |||||
| Num.Obs. | 283 | 283 | 283 | 283 | 283 | 283 | 283 | 283 |
| R2 | 0.399 | 0.467 | 0.517 | 0.517 | 0.399 | 0.467 | 0.517 | 0.517 |
| R2 Adj. | 0.390 | 0.451 | 0.496 | 0.496 | 0.390 | 0.451 | 0.496 | 0.496 |
| AIC | 1090.9 | 1064.7 | 1044.7 | 1044.7 | 469.1 | 442.9 | 422.9 | 422.9 |
| BIC | 1112.8 | 1101.2 | 1095.7 | 1095.7 | 491.0 | 479.4 | 473.9 | 473.9 |
| Log.Lik. | -539.455 | -522.373 | -508.341 | -508.341 | -228.548 | -211.466 | -197.434 | -197.434 |
| RMSE | 1.63 | 1.53 | 1.46 | 1.46 | 0.54 | 0.51 | 0.49 | 0.49 |
We can similarly plot the coefficient values and their uncertainty highlighting how the representation or scaling of the variables impact the scale of the coefficient weights and therefore the surety of any subsequent claims.
models = list(
"model_sum_1st_factors" = mod_sum_1st,
"model_sum_1st_2nd_factors" = mod_sum_12,
"model_sum_score" = mod_sum,
"model_sum_score_norm" = mod_sum_norm,
"model_mean_1st_factors" = mod_mean_1st,
"model_mean_1st_2nd_factors" = mod_mean_12,
"model_mean_score"= mod_mean,
"model_mean_score_norm" = mod_mean_norm
)
modelplot(models, coef_omit = 'Intercept') + geom_vline(xintercept = 0, linetype="dotted",
color = "black") + ggtitle("Comparing Model Parameter Estimates", "Across Covariates")Significant Coefficients?
An alternative lens on these figures highlights the statistical significance of the coefficients. But again, these criteria are much abused. Significance at what level? Conditional on which representation?
g1 = modelplot(mod_sum, coef_omit = 'Intercept') + aes(color = ifelse(p.value < 0.001, "Significant at 0.001", "Not significant at 0.001")) +
scale_color_manual(values = c("grey", "blue")) + ggtitle("Significance of Coefficient Values", "At Different Levels")
g2 = modelplot(mod_mean, coef_omit = 'Intercept') + aes(color = ifelse(p.value < 0.05, "Significant at 0.05", "Not significant at 0.05")) +
scale_color_manual(values = c("grey", "black"))
plot <- ggarrange(g1,g2, ncol=2, nrow=1);Aggregate Driver Scores
Perhaps we play with the feature representation and increase the proportion of significant indicators. Can we now tell a more definitive story about how parental support and social self-efficact are determinants of self-reported life-satisfaction scores? Let’s focus here on the sum score representation and add interaction effects.
df$se_acad_mean <- rowMeans(df[c('se_acad_p1', 'se_acad_p2', 'se_acad_p3')])
df$se_social_mean <- rowMeans(df[c('se_social_p1', 'se_social_p2', 'se_social_p3')])
df$sup_friends_mean <- rowMeans(df[c('sup_friends_p1', 'sup_friends_p2', 'sup_friends_p3')])
df$sup_parents_mean <- rowMeans(df[c('sup_parents_p1', 'sup_parents_p2', 'sup_parents_p3')])
formula_parcel_sum = "ls_sum ~ se_acad_mean + se_social_mean +
sup_friends_mean + sup_parents_mean " #sup_parents_mean*se_social_mean"
formula_parcel_sum_inter = "ls_sum ~ se_acad_mean + se_social_mean +
sup_friends_mean + sup_parents_mean + sup_parents_mean*se_social_mean"
mod_sum_parcel = lm(formula_parcel_sum, df)
mod_sum_inter_parcel = lm(formula_parcel_sum_inter, df)
models_parcel = list("model_sum_score" = mod_sum_parcel,
"model_sum_inter_score"= mod_sum_inter_parcel
)
modelsummary(models_parcel, stars=TRUE)| model_sum_score | model_sum_inter_score | |
|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
| (Intercept) | 2.728** | -6.103+ |
| (0.931) | (3.356) | |
| se_acad_mean | 0.307+ | 0.370* |
| (0.158) | (0.158) | |
| se_social_mean | 1.269*** | 2.859*** |
| (0.175) | (0.606) | |
| sup_friends_mean | 0.124 | 0.183+ |
| (0.097) | (0.099) | |
| sup_parents_mean | 0.726*** | 2.242*** |
| (0.099) | (0.562) | |
| se_social_mean × sup_parents_mean | -0.292** | |
| (0.107) | ||
| Num.Obs. | 283 | 283 |
| R2 | 0.489 | 0.503 |
| R2 Adj. | 0.482 | 0.494 |
| AIC | 1044.6 | 1039.0 |
| BIC | 1066.4 | 1064.5 |
| Log.Lik. | -516.288 | -512.513 |
| RMSE | 1.50 | 1.48 |
What does definitive mean here? Is it so simple as more significant coefficents? Marginally better performance measures?
g1 = modelplot(mod_sum_parcel, coef_omit = 'Intercept') + aes(color = ifelse(p.value < 0.05, "Significant at 0.05", "Not significant at 0.05")) +
scale_color_manual(values = c("grey", "blue")) + ggtitle("Significance of Coefficient Values", "At Different Levels for Sum and Mean Scores Life Satisfaction ")
g2 = modelplot(mod_sum_inter_parcel, coef_omit = 'Intercept') + aes(color = ifelse(p.value < 0.05, "Significant at 0.05", "Not significant at 0.05")) +
scale_color_manual(values = c("grey", "blue"))
plot <- ggarrange(g1,g2, ncol=2, nrow=1);This kind of brinkmanship is brittle. Any one of these kinds of choice can be justified but more often than not results from an suspect exploratory process. Steps down a garden of forking paths seeking some kind of story to justify an analysis or promote a conclusion. This post-hoc “seeking” is just bad science undermining the significance claims that accrue to reliable procedures. It warps the nature of testing procedure by corrupting the assumed consistency of repeatable trials. The guarantees of statistical significance attach to a conclusion just when the procedure is imagined replicable and repeated under identical conditions. By exploring the different representations and narrative criteria of adequacy we break those guarantees.
Exploratory and Confirmatory Modes
One of the things that psychometrics has pioneered well is the distinction between an exploratory and confirmatory models. This distinction, when made explicit, partially guards against the abuse of inferential integrity we see in more common work-flows. But additionally, models are often opaque - you may (as above) have improved some measure of model fit, changed the parameter weighting accorded to an observed feature, but that does that mean? Exploration of model architectures, design choices and feature creation is just how we come to understand the meaning of a model specification. Even in the simple case of regression we’ve seen how adding interaction terms changes the interpretability of a model. How then are we to stand behind uncertainty estimates accorded to parameter weights when we barely intuit the implications of a model design?
Marginal Effects
Evaluating inferences
pred <- predictions(mod_sum_parcel, newdata = datagrid(sup_parents_mean = 1:10 ))
pred1 <- predictions(mod_sum_inter_parcel, newdata = datagrid(sup_parents_mean = 1:10))
pred1 |> head() |> kableExtra::kable() |> kable_styling() |> kable_classic(full_width = F, html_font = "Cambria")| rowid | estimate | std.error | statistic | p.value | s.value | conf.low | conf.high | se_acad_mean | se_social_mean | sup_friends_mean | sup_parents_mean | ls_sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 13.03023 | 0.5060063 | 25.75112 | 0 | 483.3544 | 12.03847 | 14.02198 | 5.237686 | 5.402768 | 5.929329 | 1 | 17.58333 |
| 2 | 13.69527 | 0.4074347 | 33.61341 | 0 | 820.4207 | 12.89671 | 14.49383 | 5.237686 | 5.402768 | 5.929329 | 2 | 17.58333 |
| 3 | 14.36031 | 0.3102480 | 46.28656 | 0 | Inf | 13.75224 | 14.96839 | 5.237686 | 5.402768 | 5.929329 | 3 | 17.58333 |
| 4 | 15.02536 | 0.2163209 | 69.45865 | 0 | Inf | 14.60137 | 15.44934 | 5.237686 | 5.402768 | 5.929329 | 4 | 17.58333 |
| 5 | 15.69040 | 0.1327619 | 118.18448 | 0 | Inf | 15.43019 | 15.95061 | 5.237686 | 5.402768 | 5.929329 | 5 | 17.58333 |
| 6 | 16.35544 | 0.0935245 | 174.87863 | 0 | Inf | 16.17214 | 16.53875 | 5.237686 | 5.402768 | 5.929329 | 6 | 17.58333 |
pred |> head() |> kableExtra::kable() |> kable_styling() |> kable_classic(full_width = F, html_font = "Cambria")| rowid | estimate | std.error | statistic | p.value | s.value | conf.low | conf.high | se_acad_mean | se_social_mean | sup_friends_mean | sup_parents_mean | ls_sum |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 12.65390 | 0.4926138 | 25.68725 | 0 | 480.9813 | 11.68839 | 13.61940 | 5.237686 | 5.402768 | 5.929329 | 1 | 17.58333 |
| 2 | 13.37987 | 0.3953306 | 33.84477 | 0 | 831.6885 | 12.60504 | 14.15471 | 5.237686 | 5.402768 | 5.929329 | 2 | 17.58333 |
| 3 | 14.10585 | 0.2994218 | 47.11030 | 0 | Inf | 13.51899 | 14.69271 | 5.237686 | 5.402768 | 5.929329 | 3 | 17.58333 |
| 4 | 14.83183 | 0.2068081 | 71.71783 | 0 | Inf | 14.42649 | 15.23716 | 5.237686 | 5.402768 | 5.929329 | 4 | 17.58333 |
| 5 | 15.55780 | 0.1250396 | 124.42301 | 0 | Inf | 15.31273 | 15.80288 | 5.237686 | 5.402768 | 5.929329 | 5 | 17.58333 |
| 6 | 16.28378 | 0.0908259 | 179.28572 | 0 | Inf | 16.10576 | 16.46180 | 5.237686 | 5.402768 | 5.929329 | 6 | 17.58333 |
Regression Marginal Effects
g = plot_predictions(mod_sum_parcel, condition = c("se_social_mean"), type = "response") + ggtitle("Counterfactual Shift of Outcome: se_social_mean", "Holding all else Fixed")
g1 = plot_predictions(mod_sum_inter_parcel, condition = c("se_social_mean"), type = "response") + ggtitle("Counterfactual Shift of Outcome: se_social_mean", "Holding all else Fixed Interaction Model")
plot <- ggarrange(g,g1, ncol=1, nrow=2);Hierarchical Models
formula_hierarchy_mean = "ls_mean ~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3 + sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + se_acad_p1 + se_acad_p2 + se_acad_p3 +
se_social_p1 + se_social_p2 + se_social_p3 + (1 | region)"
formula_hierarchy_sum = "ls_sum ~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3 + sup_friends_p1 + sup_friends_p2 + sup_friends_p3 + se_acad_p1 + se_acad_p2 + se_acad_p3 +
se_social_p1 + se_social_p2 + se_social_p3 + (1 | region)"
hierarchical_mean_fit <- lmer(formula_hierarchy_mean, data = df, REML = TRUE)boundary (singular) fit: see help('isSingular')
hierarchical_sum_fit <- lmer(formula_hierarchy_sum, data = df, REML = TRUE)boundary (singular) fit: see help('isSingular')
g1 = modelplot(hierarchical_mean_fit, re.form=NA) + aes(color = ifelse(p.value < 0.001, "Significant at 0.001", "Not significant at 0.001")) +
scale_color_manual(values = c("grey", "blue")) + ggtitle("Significance of Coefficient Values", "At Different Levels")
g2 = modelplot(hierarchical_sum_fit, re.form=NA) + aes(color = ifelse(p.value < 0.05, "Significant at 0.05", "Not significant 0.05")) +
scale_color_manual(values = c("grey", "black"))
plot <- ggarrange(g1,g2, ncol=2, nrow=1);Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
modelsummary(list("hierarchical_mean_fit"= hierarchical_mean_fit,
"hierarchical_sum_fit"= hierarchical_sum_fit),
stars = TRUE) |>
style_tt(
i = 2:25,
j = 1:1,
background = "#17C2AD",
color = "white",
italic = TRUE)| hierarchical_mean_fit | hierarchical_sum_fit | |
|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
| (Intercept) | 0.698* | 2.094* |
| (0.318) | (0.954) | |
| sup_parents_p1 | 0.024 | 0.072 |
| (0.048) | (0.143) | |
| sup_parents_p2 | 0.039 | 0.118 |
| (0.048) | (0.144) | |
| sup_parents_p3 | 0.175*** | 0.526*** |
| (0.042) | (0.126) | |
| sup_friends_p1 | -0.091+ | -0.272+ |
| (0.050) | (0.150) | |
| sup_friends_p2 | 0.110* | 0.331* |
| (0.053) | (0.160) | |
| sup_friends_p3 | 0.055 | 0.165 |
| (0.043) | (0.130) | |
| se_acad_p1 | -0.069 | -0.208 |
| (0.064) | (0.192) | |
| se_acad_p2 | 0.109 | 0.327 |
| (0.067) | (0.202) | |
| se_acad_p3 | 0.051 | 0.153 |
| (0.058) | (0.174) | |
| se_social_p1 | 0.118+ | 0.355+ |
| (0.067) | (0.200) | |
| se_social_p2 | 0.170* | 0.509* |
| (0.073) | (0.219) | |
| se_social_p3 | 0.148** | 0.443** |
| (0.054) | (0.161) | |
| SD (Intercept region) | 0.000 | 0.000 |
| SD (Observations) | 0.498 | 1.493 |
| Num.Obs. | 283 | 283 |
| R2 Marg. | 0.506 | 0.506 |
| AIC | 482.6 | 1075.9 |
| BIC | 537.3 | 1130.6 |
| RMSE | 0.49 | 1.46 |
Hierarchical Marginal Effects
g = plot_predictions(hierarchical_mean_fit, condition = c("sup_parents_p3", "region"), type = "response", re.form=NA) + ggtitle("Counterfactual Shift of Outcome: sup_parents_p3", "Holding all else Fixed")
g1 = plot_predictions(hierarchical_mean_fit, condition = c("sup_friends_p1", "region"), type = "response", re.form=NA) + ggtitle("Counterfactual Shift of Outcome: sup_friends_p1", "Holding all else Fixed")
g2 = plot_predictions(hierarchical_mean_fit, condition = c("se_acad_p1", "region"),
type = "response", re.form=NA) + ggtitle("Counterfactual Shift of Outcome: se_acad_p1", "Holding all else Fixed")
plot <- ggarrange(g,g1,g2, ncol=1, nrow=3);Confirmatory Factor Analysis
model_measurement <- "
# Measurement model
SUP_Parents =~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3
SUP_Friends =~ sup_friends_p1 + sup_friends_p2 + sup_friends_p3
SE_Academic =~ se_acad_p1 + se_acad_p2 + se_acad_p3
SE_Social =~ se_social_p1 + se_social_p2 + se_social_p3
LS =~ ls_p1 + ls_p2 + ls_p3
"
model_measurement1 <- "
# Measurement model
SUP_Parents =~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3
SUP_Friends =~ a1*sup_friends_p1 + a2*sup_friends_p2 + a3*sup_friends_p3
SE_Academic =~ se_acad_p1 + se_acad_p2 + se_acad_p3
SE_Social =~ se_social_p1 + se_social_p2 + se_social_p3
LS =~ ls_p1 + ls_p2 + ls_p3
a1 == a2
a1 == a3
"
fit_mod <- cfa(model_measurement, data = df)
fit_mod_1<- cfa(model_measurement1, data = df)
cfa_models = list("full_measurement_model" = fit_mod,
"measurement_model_reduced" = fit_mod_1)
modelplot(cfa_models)Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_segment()`).
summary(fit_mod, fit.measures = TRUE, standardized = TRUE) lavaan 0.6-18 ended normally after 49 iterations
Estimator ML
Optimization method NLMINB
Number of model parameters 40
Number of observations 283
Model Test User Model:
Test statistic 223.992
Degrees of freedom 80
P-value (Chi-square) 0.000
Model Test Baseline Model:
Test statistic 2696.489
Degrees of freedom 105
P-value 0.000
User Model versus Baseline Model:
Comparative Fit Index (CFI) 0.944
Tucker-Lewis Index (TLI) 0.927
Loglikelihood and Information Criteria:
Loglikelihood user model (H0) -4285.972
Loglikelihood unrestricted model (H1) -4173.976
Akaike (AIC) 8651.944
Bayesian (BIC) 8797.761
Sample-size adjusted Bayesian (SABIC) 8670.921
Root Mean Square Error of Approximation:
RMSEA 0.080
90 Percent confidence interval - lower 0.067
90 Percent confidence interval - upper 0.092
P-value H_0: RMSEA <= 0.050 0.000
P-value H_0: RMSEA >= 0.080 0.500
Standardized Root Mean Square Residual:
SRMR 0.056
Parameter Estimates:
Standard errors Standard
Information Expected
Information saturated (h1) model Structured
Latent Variables:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
SUP_Parents =~
sup_parents_p1 1.000 0.935 0.873
sup_parents_p2 1.036 0.056 18.613 0.000 0.969 0.887
sup_parents_p3 0.996 0.059 16.754 0.000 0.932 0.816
SUP_Friends =~
sup_friends_p1 1.000 1.021 0.906
sup_friends_p2 0.792 0.043 18.463 0.000 0.809 0.857
sup_friends_p3 0.891 0.050 17.741 0.000 0.910 0.831
SE_Academic =~
se_acad_p1 1.000 0.695 0.878
se_acad_p2 0.809 0.050 16.290 0.000 0.562 0.820
se_acad_p3 0.955 0.058 16.500 0.000 0.664 0.829
SE_Social =~
se_social_p1 1.000 0.638 0.843
se_social_p2 0.967 0.056 17.248 0.000 0.617 0.885
se_social_p3 0.928 0.067 13.880 0.000 0.592 0.741
LS =~
ls_p1 1.000 0.667 0.718
ls_p2 0.778 0.074 10.498 0.000 0.519 0.712
ls_p3 0.968 0.090 10.730 0.000 0.645 0.732
Covariances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
SUP_Parents ~~
SUP_Friends 0.132 0.064 2.073 0.038 0.138 0.138
SE_Academic 0.218 0.046 4.727 0.000 0.336 0.336
SE_Social 0.282 0.045 6.224 0.000 0.472 0.472
LS 0.405 0.057 7.132 0.000 0.650 0.650
SUP_Friends ~~
SE_Academic 0.071 0.047 1.493 0.136 0.100 0.100
SE_Social 0.196 0.046 4.281 0.000 0.301 0.301
LS 0.175 0.051 3.445 0.001 0.257 0.257
SE_Academic ~~
SE_Social 0.271 0.036 7.493 0.000 0.611 0.611
LS 0.238 0.039 6.065 0.000 0.514 0.514
SE_Social ~~
LS 0.321 0.042 7.659 0.000 0.755 0.755
Variances:
Estimate Std.Err z-value P(>|z|) Std.lv Std.all
.sup_parents_p1 0.273 0.037 7.358 0.000 0.273 0.238
.sup_parents_p2 0.255 0.038 6.738 0.000 0.255 0.213
.sup_parents_p3 0.437 0.048 9.201 0.000 0.437 0.335
.sup_friends_p1 0.227 0.040 5.656 0.000 0.227 0.179
.sup_friends_p2 0.238 0.030 7.936 0.000 0.238 0.266
.sup_friends_p3 0.371 0.042 8.809 0.000 0.371 0.310
.se_acad_p1 0.144 0.022 6.593 0.000 0.144 0.229
.se_acad_p2 0.153 0.018 8.621 0.000 0.153 0.327
.se_acad_p3 0.200 0.024 8.372 0.000 0.200 0.313
.se_social_p1 0.166 0.020 8.134 0.000 0.166 0.290
.se_social_p2 0.106 0.016 6.542 0.000 0.106 0.217
.se_social_p3 0.288 0.028 10.132 0.000 0.288 0.451
.ls_p1 0.417 0.045 9.233 0.000 0.417 0.484
.ls_p2 0.261 0.028 9.321 0.000 0.261 0.492
.ls_p3 0.362 0.040 9.005 0.000 0.362 0.465
SUP_Parents 0.875 0.098 8.910 0.000 1.000 1.000
SUP_Friends 1.042 0.111 9.407 0.000 1.000 1.000
SE_Academic 0.483 0.054 8.880 0.000 1.000 1.000
SE_Social 0.407 0.048 8.403 0.000 1.000 1.000
LS 0.444 0.069 6.394 0.000 1.000 1.000
g1 = plot_heatmap(cov(df[, drivers]))
g2 = plot_heatmap(data.frame(fitted(fit_mod)$cov), title="Model Implied Covariances", "Fitted Values")
plot <- ggarrange(g1,g2, ncol=1, nrow=2);Modification Indices
modindices(fit_mod, sort = TRUE, maximum.number = 6, standardized = FALSE, minimum.value = 5) lhs op rhs mi epc
57 SUP_Parents =~ ls_p3 28.086 0.411
152 sup_friends_p1 ~~ se_social_p3 18.792 0.094
64 SUP_Friends =~ se_social_p1 16.789 -0.134
60 SUP_Friends =~ sup_parents_p3 16.443 -0.189
68 SUP_Friends =~ ls_p2 15.119 0.155
210 ls_p2 ~~ ls_p3 13.346 -0.105
Summary Global Fit Measures
summary_df = cbind(fitMeasures(fit_mod, c("chisq", "baseline.chisq", "cfi", "aic", "bic", "rmsea","srmr")),
fitMeasures(fit_mod_1, c("chisq", "baseline.chisq", "cfi", "aic", "bic", "rmsea","srmr")))
colnames(summary_df) = c('Full Model', 'Reduced Model')
summary_df |> kable() |> kable_styling() |> kable_classic(full_width = F, html_font = "Cambria")| Full Model | Reduced Model | |
|---|---|---|
| chisq | 223.9922306 | 242.1450133 |
| baseline.chisq | 2696.4887420 | 2696.4887420 |
| cfi | 0.9444365 | 0.9382035 |
| aic | 8651.9435210 | 8666.0963037 |
| bic | 8797.7613969 | 8804.6232858 |
| rmsea | 0.0797501 | 0.0830724 |
| srmr | 0.0558656 | 0.0587591 |
semPlot::semPaths(fit_mod, whatLabels = 'est', intercepts = FALSE)semPlot::semPaths(fit_mod_1, whatLabels = 'std', intercepts = FALSE)Comparing the empirical and implied variance-covariance matrix
heat_df = data.frame(resid(fit_mod, type = "standardized")$cov)
heat_df = heat_df |> as.matrix() |> melt()
colnames(heat_df) <- c("x", "y", "value")
g1 = heat_df |> mutate(value = round(value, 2)) |> ggplot(aes(x = x, y = y, fill = value)) +
geom_tile() + geom_text(aes(label = value), color = "black", size = 4) +
scale_fill_gradient2(
high = 'dodgerblue4',
mid = 'white',
low = 'firebrick2'
) + theme(axis.text.x = element_text(angle=45)) + ggtitle("Residuals of the Sample Covariances and Model Implied Covariances", "A Visual Check of Model fit: Full Model")
heat_df = data.frame(resid(fit_mod_1, type = "standardized")$cov)
heat_df = heat_df |> as.matrix() |> melt()
colnames(heat_df) <- c("x", "y", "value")
g2 = heat_df |> mutate(value = round(value, 2)) |> ggplot(aes(x = x, y = y, fill = value)) +
geom_tile() + geom_text(aes(label = value), color = "black", size = 4) +
scale_fill_gradient2(
high = 'dodgerblue4',
mid = 'white',
low = 'firebrick2'
) + theme(axis.text.x = element_text(angle=45)) + ggtitle("Residuals of the Sample Covariances and Model Implied Covariances", "A Visual Check of Model fit: Reduced Model")
plot <- ggarrange(g1,g2, ncol=1, nrow=2);anova(fit_mod)Chi-Squared Test Statistic (unscaled)
Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
Saturated 0 0.00
Model 80 8651.9 8797.8 223.99 223.99 80 0.000000000000001443 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit_mod_1)Chi-Squared Test Statistic (unscaled)
Df AIC BIC Chisq Chisq diff Df diff Pr(>Chisq)
Saturated 0 0.00
Model 82 8666.1 8804.6 242.15 242.15 82 < 0.00000000000000022 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit_mod, fit_mod_1)
Chi-Squared Difference Test
Df AIC BIC Chisq Chisq diff RMSEA Df diff Pr(>Chisq)
fit_mod 80 8651.9 8797.8 223.99
fit_mod_1 82 8666.1 8804.6 242.15 18.153 0.16893 2 0.0001143 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Structural Equation Models
model <- "
# Measurement model
SUP_Parents =~ sup_parents_p1 + sup_parents_p2 + sup_parents_p3
SUP_Friends =~ sup_friends_p1 + sup_friends_p2 + sup_friends_p3
SE_Academic =~ se_acad_p1 + se_acad_p2 + se_acad_p3
SE_Social =~ se_social_p1 + se_social_p2 + se_social_p3
LS =~ ls_p1 + ls_p2 + ls_p3
# Structural model
# Regressions
LS ~ SE_Academic + SE_Social + SUP_Parents + SUP_Friends
# Residual covariances
SE_Academic ~~ SE_Social
"
fit_mod_sem <- sem(model, data = df)modelplot(fit_mod_sem)semPlot::semPaths(fit_mod_sem, whatLabels = 'std', intercepts = FALSE)heat_df = data.frame(resid(fit_mod_sem, type = "standardized")$cov)
heat_df = heat_df |> as.matrix() |> melt()
colnames(heat_df) <- c("x", "y", "value")
heat_df |> mutate(value = round(value, 2)) |> ggplot(aes(x = x, y = y, fill = value)) +
geom_tile() + geom_text(aes(label = value), color = "black", size = 4) +
scale_fill_gradient2(
high = 'dodgerblue4',
mid = 'white',
low = 'firebrick2'
) + theme(axis.text.x = element_text(angle=45)) + ggtitle("Residuals of the Sample Covariances and Model Implied Covariances", "A Visual Check of Model fit")Confirmatory Factor Models with PyMC
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pytensor import tensor as pt
import arviz as az
import networkx as nx
np.random.seed(150)
df_p = pd.read_csv('IIS.dat', sep='\s+')
df_p.head() PI AD IGC FI FC
0 4.00 3.38 4.67 2.6 3.2
1 2.57 3.00 3.50 2.4 2.8
2 2.29 3.29 4.83 2.0 3.4
3 2.43 3.63 4.33 3.6 3.8
4 3.00 4.00 4.83 3.4 3.8
coords = {'obs': list(range(len(df_p))),
'indicators': ['PI', 'AD', 'IGC', 'FI', 'FC'],
'indicators_1': ['PI', 'AD', 'IGC'],
'indicators_2': ['FI', 'FC'],
'latent': ['Student', 'Faculty']
}
obs_idx = list(range(len(df_p)))
with pm.Model(coords=coords) as model:
Psi = pm.InverseGamma('Psi', 5, 10, dims='indicators')
lambdas_ = pm.Normal('lambdas_1', 1, 10, dims=('indicators_1'))
lambdas_1 = pm.Deterministic('lambdas1', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_1'))
lambdas_ = pm.Normal('lambdas_2', 1, 10, dims=('indicators_2'))
lambdas_2 = pm.Deterministic('lambdas2', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_2'))
tau = pm.Normal('tau', 3, 10, dims='indicators')
kappa = 0
sd_dist = pm.Exponential.dist(1.0, shape=2)
chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=2, eta=2,
sd_dist=sd_dist, compute_corr=True)
ksi = pm.MvNormal('ksi', kappa, chol=chol, dims=('obs', 'latent'))
m1 = tau[0] + ksi[obs_idx, 0]*lambdas_1[0]
m2 = tau[1] + ksi[obs_idx, 0]*lambdas_1[1]
m3 = tau[2] + ksi[obs_idx, 0]*lambdas_1[2]
m4 = tau[3] + ksi[obs_idx, 1]*lambdas_2[0]
m5 = tau[4] + ksi[obs_idx, 1]*lambdas_2[1]
mu = pm.Deterministic('mu', pm.math.stack([m1, m2, m3, m4, m5]).T)
_ = pm.Normal('likelihood', mu, Psi, observed=df_p.values)
idata = pm.sample(nuts_sampler='numpyro', target_accept=.95,
idata_kwargs={"log_likelihood": True})
idata.extend(pm.sample_posterior_predictive(idata))
summary_df = az.summary(idata, var_names=['lambdas1', 'lambdas2', 'tau', 'Psi', 'ksi', 'chol_cov_corr'], coords= {'obs': [0, 7]})py$summary_df |> kable() |> kable_classic(full_width = F, html_font = "Cambria")| mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
|---|---|---|---|---|---|---|---|---|---|
| lambdas1[PI] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas1[AD] | 0.898 | 0.062 | 0.784 | 1.015 | 0.004 | 0.003 | 267 | 524 | 1.02 |
| lambdas1[IGC] | 0.536 | 0.045 | 0.452 | 0.619 | 0.002 | 0.002 | 394 | 799 | 1.02 |
| lambdas2[FI] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas2[FC] | 0.980 | 0.058 | 0.876 | 1.091 | 0.003 | 0.002 | 443 | 943 | 1.00 |
| tau[PI] | 3.332 | 0.037 | 3.262 | 3.398 | 0.002 | 0.001 | 563 | 1555 | 1.00 |
| tau[AD] | 3.897 | 0.026 | 3.850 | 3.948 | 0.001 | 0.001 | 387 | 1008 | 1.00 |
| tau[IGC] | 4.596 | 0.020 | 4.558 | 4.634 | 0.001 | 0.001 | 666 | 1582 | 1.00 |
| tau[FI] | 3.033 | 0.039 | 2.961 | 3.108 | 0.002 | 0.001 | 476 | 1158 | 1.00 |
| tau[FC] | 3.712 | 0.034 | 3.650 | 3.779 | 0.002 | 0.001 | 374 | 922 | 1.00 |
| Psi[PI] | 0.610 | 0.024 | 0.565 | 0.654 | 0.001 | 0.000 | 1250 | 2084 | 1.00 |
| Psi[AD] | 0.318 | 0.020 | 0.282 | 0.355 | 0.001 | 0.001 | 484 | 1118 | 1.00 |
| Psi[IGC] | 0.355 | 0.013 | 0.330 | 0.380 | 0.000 | 0.000 | 2184 | 2208 | 1.00 |
| Psi[FI] | 0.570 | 0.026 | 0.522 | 0.620 | 0.001 | 0.001 | 1008 | 1854 | 1.00 |
| Psi[FC] | 0.419 | 0.026 | 0.374 | 0.471 | 0.001 | 0.001 | 694 | 1344 | 1.00 |
| ksi[0, Student] | -0.225 | 0.224 | -0.633 | 0.197 | 0.003 | 0.003 | 4431 | 3395 | 1.00 |
| ksi[0, Faculty] | -0.370 | 0.277 | -0.918 | 0.134 | 0.004 | 0.003 | 4265 | 2674 | 1.00 |
| ksi[7, Student] | 0.881 | 0.232 | 0.430 | 1.302 | 0.004 | 0.003 | 3152 | 2768 | 1.00 |
| ksi[7, Faculty] | 0.863 | 0.275 | 0.318 | 1.347 | 0.004 | 0.003 | 3963 | 2620 | 1.00 |
| chol_cov_corr[0, 0] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| chol_cov_corr[0, 1] | 0.850 | 0.028 | 0.797 | 0.905 | 0.001 | 0.001 | 489 | 580 | 1.01 |
| chol_cov_corr[1, 0] | 0.850 | 0.028 | 0.797 | 0.905 | 0.001 | 0.001 | 489 | 580 | 1.01 |
| chol_cov_corr[1, 1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 3910 | 3911 | 1.00 |
az.plot_trace(idata, var_names=['lambdas1', 'lambdas2', 'tau', 'Psi', 'ksi']);/Users/nathanielforde/mambaforge/envs/pymc_causal/lib/python3.11/site-packages/arviz/stats/density_utils.py:487: UserWarning: Your data appears to have a single value or no finite values
warnings.warn("Your data appears to have a single value or no finite values")
/Users/nathanielforde/mambaforge/envs/pymc_causal/lib/python3.11/site-packages/arviz/stats/density_utils.py:487: UserWarning: Your data appears to have a single value or no finite values
warnings.warn("Your data appears to have a single value or no finite values")
df = pd.read_csv('sem_data.csv')
drivers = ['se_acad_p1', 'se_acad_p2',
'se_acad_p3', 'se_social_p1', 'se_social_p2', 'se_social_p3',
'sup_friends_p1', 'sup_friends_p2', 'sup_friends_p3', 'sup_parents_p1',
'sup_parents_p2', 'sup_parents_p3', 'ls_p1', 'ls_p2', 'ls_p3']
coords = {'obs': list(range(len(df))),
'indicators': drivers,
'indicators_1': ['se_acad_p1','se_acad_p2','se_acad_p3'],
'indicators_2': ['se_social_p1','se_social_p2','se_social_p3'],
'indicators_3': ['sup_friends_p1','sup_friends_p2','sup_friends_p3'],
'indicators_4': [ 'sup_parents_p1','sup_parents_p2','sup_parents_p3'],
'indicators_5': ['ls_p1','ls_p2', 'ls_p3'],
'latent': ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS'],
'latent1': ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']
}
obs_idx = list(range(len(df)))
with pm.Model(coords=coords) as model:
Psi = pm.InverseGamma('Psi', 5, 10, dims='indicators')
lambdas_ = pm.Normal('lambdas_1', 1, 10, dims=('indicators_1'))
lambdas_1 = pm.Deterministic('lambdas1', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_1'))
lambdas_ = pm.Normal('lambdas_2', 1, 10, dims=('indicators_2'))
lambdas_2 = pm.Deterministic('lambdas2', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_2'))
lambdas_ = pm.Normal('lambdas_3', 1, 10, dims=('indicators_3'))
lambdas_3 = pm.Deterministic('lambdas3', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_3'))
lambdas_ = pm.Normal('lambdas_4', 1, 10, dims=('indicators_4'))
lambdas_4 = pm.Deterministic('lambdas4', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_4'))
lambdas_ = pm.Normal('lambdas_5', 1, 10, dims=('indicators_5'))
lambdas_5 = pm.Deterministic('lambdas5', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_5'))
tau = pm.Normal('tau', 3, 10, dims='indicators')
kappa = 0
sd_dist = pm.Exponential.dist(1.0, shape=5)
chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=5, eta=2,
sd_dist=sd_dist, compute_corr=True)
cov = pm.Deterministic("cov", chol.dot(chol.T), dims=('latent', 'latent1'))
ksi = pm.MvNormal('ksi', kappa, chol=chol, dims=('obs', 'latent'))
m0 = tau[0] + ksi[obs_idx, 0]*lambdas_1[0]
m1 = tau[1] + ksi[obs_idx, 0]*lambdas_1[1]
m2 = tau[2] + ksi[obs_idx, 0]*lambdas_1[2]
m3 = tau[3] + ksi[obs_idx, 1]*lambdas_2[0]
m4 = tau[4] + ksi[obs_idx, 1]*lambdas_2[1]
m5 = tau[5] + ksi[obs_idx, 1]*lambdas_2[2]
m6 = tau[6] + ksi[obs_idx, 2]*lambdas_3[0]
m7 = tau[7] + ksi[obs_idx, 2]*lambdas_3[1]
m8 = tau[8] + ksi[obs_idx, 2]*lambdas_3[2]
m9 = tau[9] + ksi[obs_idx, 3]*lambdas_4[0]
m10 = tau[10] + ksi[obs_idx, 3]*lambdas_4[1]
m11 = tau[11] + ksi[obs_idx, 3]*lambdas_4[2]
m12 = tau[12] + ksi[obs_idx, 4]*lambdas_5[0]
m13 = tau[13] + ksi[obs_idx, 4]*lambdas_5[1]
m14 = tau[14] + ksi[obs_idx, 4]*lambdas_5[2]
mu = pm.Deterministic('mu', pm.math.stack([m0, m1, m2, m3, m4, m5, m6, m7,
m8, m9, m10, m11, m12, m13, m14]).T)
_ = pm.Normal('likelihood', mu, Psi, observed=df[drivers].values)
idata = pm.sample(nuts_sampler='numpyro', target_accept=.95, tune=1000,
idata_kwargs={"log_likelihood": True}, random_seed=100)
idata.extend(pm.sample_posterior_predictive(idata))
summary_df1 = az.summary(idata, var_names=['lambdas1', 'lambdas2', 'lambdas3', 'lambdas4', 'lambdas5', 'tau', 'Psi'])
cov_df = pd.DataFrame(az.extract(idata['posterior'])['cov'].mean(axis=2))
cov_df.index = ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']
cov_df.columns = ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']
correlation_df = pd.DataFrame(az.extract(idata['posterior'])['chol_cov_corr'].mean(axis=2))
correlation_df.index = ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']
correlation_df.columns = ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P', 'LS']py$summary_df1 |> kable() |> kable_classic(full_width = F, html_font = "Cambria")| mean | sd | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat | |
|---|---|---|---|---|---|---|---|---|---|
| lambdas1[se_acad_p1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas1[se_acad_p2] | 0.817 | 0.052 | 0.720 | 0.915 | 0.001 | 0.001 | 1193 | 2008 | 1.00 |
| lambdas1[se_acad_p3] | 0.967 | 0.060 | 0.854 | 1.076 | 0.002 | 0.001 | 1286 | 2037 | 1.00 |
| lambdas2[se_social_p1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas2[se_social_p2] | 0.965 | 0.058 | 0.856 | 1.071 | 0.002 | 0.002 | 757 | 1634 | 1.00 |
| lambdas2[se_social_p3] | 0.941 | 0.072 | 0.805 | 1.076 | 0.002 | 0.002 | 878 | 1580 | 1.00 |
| lambdas3[sup_friends_p1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas3[sup_friends_p2] | 0.802 | 0.044 | 0.720 | 0.887 | 0.001 | 0.001 | 1045 | 1701 | 1.00 |
| lambdas3[sup_friends_p3] | 0.905 | 0.053 | 0.805 | 1.006 | 0.002 | 0.001 | 1235 | 2150 | 1.00 |
| lambdas4[sup_parents_p1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas4[sup_parents_p2] | 1.040 | 0.059 | 0.931 | 1.150 | 0.002 | 0.002 | 758 | 1383 | 1.00 |
| lambdas4[sup_parents_p3] | 1.010 | 0.064 | 0.898 | 1.137 | 0.002 | 0.001 | 1051 | 1840 | 1.00 |
| lambdas5[ls_p1] | 1.000 | 0.000 | 1.000 | 1.000 | 0.000 | 0.000 | 4000 | 4000 | NaN |
| lambdas5[ls_p2] | 0.791 | 0.085 | 0.627 | 0.944 | 0.004 | 0.003 | 541 | 1074 | 1.00 |
| lambdas5[ls_p3] | 0.990 | 0.103 | 0.806 | 1.187 | 0.004 | 0.003 | 543 | 878 | 1.00 |
| tau[se_acad_p1] | 5.153 | 0.044 | 5.069 | 5.234 | 0.002 | 0.001 | 488 | 1151 | 1.01 |
| tau[se_acad_p2] | 5.345 | 0.039 | 5.271 | 5.414 | 0.002 | 0.001 | 528 | 1033 | 1.01 |
| tau[se_acad_p3] | 5.209 | 0.045 | 5.127 | 5.297 | 0.002 | 0.001 | 526 | 1290 | 1.01 |
| tau[se_social_p1] | 5.286 | 0.042 | 5.208 | 5.366 | 0.002 | 0.002 | 380 | 743 | 1.01 |
| tau[se_social_p2] | 5.473 | 0.039 | 5.397 | 5.544 | 0.002 | 0.001 | 363 | 742 | 1.01 |
| tau[se_social_p3] | 5.437 | 0.045 | 5.351 | 5.522 | 0.002 | 0.001 | 492 | 982 | 1.00 |
| tau[sup_friends_p1] | 5.782 | 0.068 | 5.651 | 5.904 | 0.004 | 0.003 | 333 | 763 | 1.01 |
| tau[sup_friends_p2] | 6.007 | 0.057 | 5.909 | 6.125 | 0.003 | 0.002 | 397 | 872 | 1.00 |
| tau[sup_friends_p3] | 5.987 | 0.066 | 5.864 | 6.115 | 0.003 | 0.002 | 385 | 890 | 1.01 |
| tau[sup_parents_p1] | 5.973 | 0.061 | 5.858 | 6.085 | 0.003 | 0.002 | 427 | 1059 | 1.00 |
| tau[sup_parents_p2] | 5.925 | 0.062 | 5.807 | 6.040 | 0.003 | 0.002 | 394 | 924 | 1.01 |
| tau[sup_parents_p3] | 5.716 | 0.066 | 5.596 | 5.840 | 0.003 | 0.002 | 470 | 1294 | 1.00 |
| tau[ls_p1] | 5.188 | 0.053 | 5.092 | 5.289 | 0.002 | 0.001 | 654 | 1378 | 1.00 |
| tau[ls_p2] | 5.775 | 0.041 | 5.693 | 5.849 | 0.002 | 0.001 | 716 | 1596 | 1.00 |
| tau[ls_p3] | 5.219 | 0.051 | 5.121 | 5.314 | 0.002 | 0.001 | 666 | 1609 | 1.00 |
| Psi[se_acad_p1] | 0.412 | 0.028 | 0.359 | 0.465 | 0.001 | 0.001 | 1278 | 1740 | 1.00 |
| Psi[se_acad_p2] | 0.413 | 0.024 | 0.367 | 0.456 | 0.001 | 0.000 | 2170 | 2268 | 1.00 |
| Psi[se_acad_p3] | 0.468 | 0.027 | 0.418 | 0.519 | 0.001 | 0.000 | 1844 | 2408 | 1.00 |
| Psi[se_social_p1] | 0.431 | 0.026 | 0.381 | 0.477 | 0.001 | 0.000 | 1382 | 2219 | 1.00 |
| Psi[se_social_p2] | 0.361 | 0.025 | 0.314 | 0.405 | 0.001 | 0.000 | 1486 | 2135 | 1.00 |
| Psi[se_social_p3] | 0.553 | 0.029 | 0.500 | 0.606 | 0.001 | 0.000 | 2594 | 2803 | 1.00 |
| Psi[sup_friends_p1] | 0.517 | 0.040 | 0.439 | 0.587 | 0.001 | 0.001 | 866 | 1739 | 1.00 |
| Psi[sup_friends_p2] | 0.508 | 0.031 | 0.454 | 0.568 | 0.001 | 0.001 | 1420 | 1985 | 1.00 |
| Psi[sup_friends_p3] | 0.625 | 0.036 | 0.562 | 0.694 | 0.001 | 0.001 | 2090 | 2329 | 1.00 |
| Psi[sup_parents_p1] | 0.550 | 0.035 | 0.485 | 0.615 | 0.001 | 0.001 | 1530 | 2075 | 1.00 |
| Psi[sup_parents_p2] | 0.536 | 0.038 | 0.465 | 0.605 | 0.001 | 0.001 | 1192 | 2078 | 1.01 |
| Psi[sup_parents_p3] | 0.675 | 0.038 | 0.602 | 0.745 | 0.001 | 0.001 | 2089 | 2371 | 1.00 |
| Psi[ls_p1] | 0.671 | 0.038 | 0.603 | 0.744 | 0.001 | 0.001 | 1045 | 2387 | 1.00 |
| Psi[ls_p2] | 0.534 | 0.030 | 0.477 | 0.591 | 0.001 | 0.001 | 1409 | 2472 | 1.00 |
| Psi[ls_p3] | 0.622 | 0.035 | 0.554 | 0.688 | 0.001 | 0.001 | 1597 | 2393 | 1.00 |
fig, ax = plt.subplots(figsize=(15, 8))
az.plot_forest(idata, var_names=['lambdas1', 'lambdas2', 'lambdas3', 'lambdas4', 'lambdas5'], combined=True, ax=ax);
ax.set_title("Factor Loadings for each of the Five Factors");py$cov_df |> kable(caption= "Covariances Amongst Latent Factors",digits=2) |> kable_styling() %>% kable_classic(full_width = F, html_font = "Cambria") |> row_spec(5, color = "red")| SE_ACAD | SE_SOCIAL | SUP_F | SUP_P | LS | |
|---|---|---|---|---|---|
| SE_ACAD | 0.47 | 0.26 | 0.06 | 0.20 | 0.22 |
| SE_SOCIAL | 0.26 | 0.39 | 0.19 | 0.26 | 0.30 |
| SUP_F | 0.06 | 0.19 | 1.03 | 0.12 | 0.16 |
| SUP_P | 0.20 | 0.26 | 0.12 | 0.86 | 0.38 |
| LS | 0.22 | 0.30 | 0.16 | 0.38 | 0.42 |
py$correlation_df |> kable( caption= "Correlations Amongst Latent Factors", digits=2) |> kable_styling() |> kable_classic(full_width = F, html_font = "Cambria") |> row_spec(5, color = "red")| SE_ACAD | SE_SOCIAL | SUP_F | SUP_P | LS | |
|---|---|---|---|---|---|
| SE_ACAD | 1.00 | 0.60 | 0.09 | 0.32 | 0.50 |
| SE_SOCIAL | 0.60 | 1.00 | 0.29 | 0.45 | 0.75 |
| SUP_F | 0.09 | 0.29 | 1.00 | 0.12 | 0.25 |
| SUP_P | 0.32 | 0.45 | 0.12 | 1.00 | 0.64 |
| LS | 0.50 | 0.75 | 0.25 | 0.64 | 1.00 |
def make_ppc(idata):
fig, axs = plt.subplots(5, 3, figsize=(20, 20))
axs = axs.flatten()
for i in range(15):
temp = idata['posterior_predictive'].sel({'likelihood_dim_3': i}).mean(dim=('chain', 'draw'))
axs[i].hist(df[drivers[i]], alpha=0.3, ec='black', bins=20, label='Observed Scores')
axs[i].hist(temp['likelihood'], color='purple', alpha=0.6, bins=20, label='Predicted Scores')
axs[i].set_title(f"Posterior Predictive Checks {drivers[i]}")
axs[i].legend();
plt.show()
make_ppc(idata)model_cov = pd.DataFrame(az.extract(idata['posterior_predictive'])['likelihood'][:, :, 0]).cov()
obs_cov = df[drivers].cov()
model_cov.index = obs_cov.index
model_cov.columns = obs_cov.columns
residuals = model_cov - obs_covplot_heatmap(py$residuals, "Residuals of the Sample Covariances and Model Implied Covariances", "A Visual Check of Bayesian Model fit")Structural Equation Modelling in PyMC
def make_sem(priors):
coords = {'obs': list(range(len(df))),
'indicators': drivers,
'indicators_1': ['se_acad_p1','se_acad_p2','se_acad_p3'],
'indicators_2': ['se_social_p1','se_social_p2','se_social_p3'],
'indicators_3': ['sup_friends_p1','sup_friends_p2','sup_friends_p3'],
'indicators_4': [ 'sup_parents_p1','sup_parents_p2','sup_parents_p3'],
'indicators_5': ['ls_p1','ls_p2', 'ls_p3'],
'latent': ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P'],
'latent1': ['SE_ACAD', 'SE_SOCIAL', 'SUP_F', 'SUP_P']
}
obs_idx = list(range(len(df)))
with pm.Model(coords=coords) as model:
Psi = pm.InverseGamma('Psi', 5, 10, dims='indicators')
lambdas_ = pm.Normal('lambdas_1', priors['lambda'][0], priors['lambda'][1], dims=('indicators_1'))
lambdas_1 = pm.Deterministic('lambdas1', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_1'))
lambdas_ = pm.Normal('lambdas_2', priors['lambda'][0], priors['lambda'][1], dims=('indicators_2'))
lambdas_2 = pm.Deterministic('lambdas2', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_2'))
lambdas_ = pm.Normal('lambdas_3', priors['lambda'][0], priors['lambda'][1], dims=('indicators_3'))
lambdas_3 = pm.Deterministic('lambdas3', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_3'))
lambdas_ = pm.Normal('lambdas_4', priors['lambda'][0], priors['lambda'][1], dims=('indicators_4'))
lambdas_4 = pm.Deterministic('lambdas4', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_4'))
lambdas_ = pm.Normal('lambdas_5', priors['lambda'][0], priors['lambda'][1], dims=('indicators_5'))
lambdas_5 = pm.Deterministic('lambdas5', pt.set_subtensor(lambdas_[0], 1), dims=('indicators_5'))
tau = pm.Normal('tau', 3, 10, dims='indicators')
kappa = 0
sd_dist = pm.Exponential.dist(1.0, shape=4)
chol, _, _ = pm.LKJCholeskyCov('chol_cov', n=4, eta=priors['eta'],
sd_dist=sd_dist, compute_corr=True)
cov = pm.Deterministic("cov", chol.dot(chol.T), dims=('latent', 'latent1'))
ksi = pm.MvNormal('ksi', kappa, chol=chol, dims=('obs', 'latent'))
beta_r = pm.Normal('beta_r', 0, 1, dims='latent')
regression = pm.Deterministic('regr', beta_r[0]*ksi[obs_idx, 0] + beta_r[1]*ksi[obs_idx, 1] +
beta_r[2]*ksi[obs_idx, 2] + beta_r[3]*ksi[obs_idx, 3])
m0 = tau[0] + ksi[obs_idx, 0]*lambdas_1[0]
m1 = tau[1] + ksi[obs_idx, 0]*lambdas_1[1]
m2 = tau[2] + ksi[obs_idx, 0]*lambdas_1[2]
m3 = tau[3] + ksi[obs_idx, 1]*lambdas_2[0]
m4 = tau[4] + ksi[obs_idx, 1]*lambdas_2[1]
m5 = tau[5] + ksi[obs_idx, 1]*lambdas_2[2]
m6 = tau[6] + ksi[obs_idx, 2]*lambdas_3[0]
m7 = tau[7] + ksi[obs_idx, 2]*lambdas_3[1]
m8 = tau[8] + ksi[obs_idx, 2]*lambdas_3[2]
m9 = tau[9] + ksi[obs_idx, 3]*lambdas_4[0]
m10 = tau[10] + ksi[obs_idx, 3]*lambdas_4[1]
m11 = tau[11] + ksi[obs_idx, 3]*lambdas_4[2]
m12 = tau[12] + regression*lambdas_5[0]
m13 = tau[13] + regression*lambdas_5[1]
m14 = tau[14] + regression*lambdas_5[2]
mu = pm.Deterministic('mu', pm.math.stack([m0, m1, m2, m3, m4, m5, m6, m7,
m8, m9, m10, m11, m12, m13, m14]).T)
_ = pm.Normal('likelihood', mu, Psi, observed=df[drivers].values)
idata = pm.sample(chains=6, nuts_sampler='numpyro', target_accept=.95, tune=1000,
idata_kwargs={"log_likelihood": True}, random_seed=100)
idata.extend(pm.sample_posterior_predictive(idata))
return model, idata
model1, idata1 = make_sem(priors={'eta': 2, 'lambda': [1, 2]})az.plot_posterior(idata1, var_names=['beta_r']);make_ppc(idata1)Citation
@online{forde,
author = {Nathaniel Forde},
title = {Measurement, {Latent} {Factors} and {Theory} {Construction}},
langid = {en}
}